library(cluster)package ‘cluster’ was built under R version 3.4.4
library(mclust)package ‘mclust’ was built under R version 3.4.4 __ ___________ __ _____________
/ |/ / ____/ / / / / / ___/_ __/
/ /|_/ / / / / / / / /\__ \ / /
/ / / / /___/ /___/ /_/ /___/ // /
/_/ /_/\____/_____/\____//____//_/ version 5.4.1
Type 'citation("mclust")' for citing this R package in publications.
library(lattice)
library(tidyverse)package ‘tidyverse’ was built under R version 3.4.2[37m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[37m[32m✔[37m [34mggplot2[37m 2.2.1.[31m9000[37m [32m✔[37m [34mpurrr [37m 0.2.4
[32m✔[37m [34mtibble [37m 1.4.2 [32m✔[37m [34mdplyr [37m 0.7.5
[32m✔[37m [34mtidyr [37m 0.8.1 [32m✔[37m [34mstringr[37m 1.3.1
[32m✔[37m [34mreadr [37m 1.1.1 [32m✔[37m [34mforcats[37m 0.3.0 [39m
package ‘tibble’ was built under R version 3.4.3package ‘purrr’ was built under R version 3.4.2package ‘forcats’ was built under R version 3.4.3[37m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31m✖[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()
[31m✖[37m [34mpurrr[37m::[32mmap()[37m masks [34mmclust[37m::map()[39m
library(maptree)Loading required package: rpart
package ‘rpart’ was built under R version 3.4.3
library(mice)package ‘mice’ was built under R version 3.4.4
Attaching package: ‘mice’
The following object is masked from ‘package:tidyr’:
complete
The following objects are masked from ‘package:base’:
cbind, rbind
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
library(mclust)
library(mice)
library(nFactors)Loading required package: MASS
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Loading required package: psych
Attaching package: ‘psych’
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
The following object is masked from ‘package:mclust’:
sim
Loading required package: boot
package ‘boot’ was built under R version 3.4.1
Attaching package: ‘boot’
The following object is masked from ‘package:psych’:
logit
The following object is masked from ‘package:lattice’:
melanoma
Attaching package: ‘nFactors’
The following object is masked from ‘package:lattice’:
parallel
library(poLCA)Loading required package: scatterplot3d
package ‘scatterplot3d’ was built under R version 3.4.4
library(Rtsne)
library(corrplot)package ‘corrplot’ was built under R version 3.4.2corrplot 0.84 loaded
utility_histogramplotter <- function(x) {
lattice::histogram( ~ x)
}
seg.summ <- function(data, groups) {
aggregate(data, list(groups), function(x) mean(as.numeric(as.character(x))))
}
minmax <- function(x)
{
return((x- min(x)) /(max(x)-min(x)))
}
plot_bi_plots <- function(df, y,...) {
old.par = par()
par(mar=c(5,4,5,2))
on.exit(expr = 'par = old.par')
prcomp(df) %>% biplot(col = c('gray', 'red'), cex = .8, main = y, ...)
}
adjust_q13 <- function(df){
df %<>%
mutate(q13_visitfreq_social= (q13r1+q13r2+q13r3 +q13r11)/4,
q13_visitfreq_music = (q13r4+q13r7+q13r8 +q13r9)/4,
q13_visitfreq_video = (q13r5+q13r6+q13r10+q13r12)/4)
df[,!str_detect(names(df),'q13r')]
}
adjust_q24 <- function(df){
df %<>%
mutate(
q24_tech_posatt = (q24r1+q24r2+q24r3+q24r5+q24r6)/5,
q24_tech_enter = (q24r7+q24r8)/2,
q24_tech_comm = (q24r10+q24r11+q24r12)/3,
q24_tech_negatv = (q24r4+q24r9)/2
)
df[,!str_detect(names(df),'q24r')]
}
adjust_q25 <- function(df){
df %<>%
mutate(
q25_prsnlty_leader = (q25r1+q25r2+q25r3+q25r4)/4,
q25_prsnlty_risk = (q25r5+q25r7+q25r8)/3,
q25_prsnlty_drive = (q25r9+q25r10+q25r11+q25r12)/4,
q25_prsnlty_follower = q25r6
)
df[,!str_detect(names(df),'q25r')]
}
adjust_q26 <- function(df){
df %<>%
mutate(
q26_shopsavvy_bargain = (q26r3+q26r5)/2,
q26_shopsavvy_brands = (q26r18+q26r7+q26r13+q26r14+q26r15)/5,
q26_shopsavvy_earn2spend = (q26r13+q26r16+q26r4)/3,
q26_shopsavvy_applover = (q26r17+q26r12+q26r10+q26r8+q26r6+q26r9)/6,
q26_shopsavvy_children = q26r11,
)
df[,!str_detect(names(df),'q26r')]
}
adjust_q2 <- function(df){
df %<>%
mutate(
q2_apple = ifelse(q2r1==1|q2r2==1,1,0),
q2_andriod = q2r3,
q2_windows = q2r6,
q2_tablet = q2r8,
q2_other = ifelse(q2r4==1|q2r5==1|q2r7==1|q2r9==1,1,0)
)
df[,!str_detect(names(df),'q2r')]
}
adjust_q4 <- function(df){
df %<>%
mutate(
q4_use_music_apps =q4r1,
q4_use_tv_apps =ifelse(q4r2==1|q4r3==1|q4r4==1,1,0),
q4_use_game_apps =q4r5,
q4_use_social_apps =q4r6,
q4_use_news_apps =ifelse(q4r7==1|q4r9==1,1,0),
q4_use_shop_apps =q4r8,
q4_use_none_apps =q4r11
)
df[,!str_detect(names(df),'q4r')]
}
adjust_race <- function(df){
df %>%
mutate(
q54_white = ifelse(q54==1,1,0),
q54_black = ifelse(q54==2,1,0),
q54_asian = ifelse(q54==3,1,0),
q54_hawai = ifelse(q54==4,1,0),
q54_native = ifelse(q54==5,1,0),
q54_other = ifelse(q54==6,1,0),
q55_latino = ifelse(q55==1,1,0)
) %>%
dplyr::select(-q54,-q55)
}
adjust_gender <- function(df){
df %>%
mutate(q57 = ifelse(q57==1,1,0))
}
adjust_marital <- function(df){
df %>%
mutate(q49 = ifelse(q49==1,1,0))
}
adjust_q11 <- function(df){
#Q11 has artificial ordinality between #of apps increasing, and response==5 as "Dont know"
#This function resolves this problem
df %>%
mutate(q11 = ifelse(q11==5 | q11==6, 0, q11))
}
adjust_children <- function(df){
df %>%
dplyr::select(-q50r2_inftod,-q50r3_6_12,-q50r4_13_17,-q50r5_adult)
}
adjust_income <- function(df){
df %>%
dplyr::mutate(q56 = case_when(
q56 <= 4 ~ 1,
q56 >= 5 & q56 <=8 ~ 2,
q56 >= 9 & q56 <=11 ~ 3,
q56 >= 12 & q56 <= 13 ~ 4,
q56 >= 14 ~ 5
))
}
adjust_age <- function(df){
df %>%
dplyr::mutate(
q1 = case_when(
q1 <= 2 ~ 1,
q1 >= 3 & q1 <= 5 ~ 2,
q1 >= 6 & q1 <= 8 ~ 3,
q1 >= 9 & q1 <= 11 ~ 4,
q1 >= 12 ~ 5,
)
)
}
adjust_names <- function(df){
df %>%
dplyr::rename(
q1_age=q1,
q11_appnum = q11,
q12_freeapppc = q12,
q48_edu = q48,
q49_marital = q49,
q56_income = q56,
q57_mf = q57,
q50r1_nochild = q50r1,
q50r2_inftod = q50r2,
q50r3_6_12 = q50r3,
q50r4_13_17 = q50r4,
q50r5_adult = q50r5
)
}load('../data/apphappyData.RData')
df_labs <- tbl_df(apphappy.3.labs.frame)
df_nums <- tbl_df(apphappy.3.num.frame)
dim(df_nums)[1] 1800 89
rm(apphappy.3.num.frame); rm(apphappy.3.labs.frame)
df_labs$q57 <- as.factor(df_labs$q57)
df_labs$caseID <- NULL
df_nums$caseID <- NULL
df_labs$q2r10 <- NULL
df_nums$q2r10 <- NULL
df_labs$q5r1 <- NULL
df_nums$q5r1 <- NULLdf_nums$q12[is.na(df_nums$q12)] <- 0
df_nums %>% xtabs(~q12+q11,.,addNA = T) q11
q12 1 2 3 4 5 6
0 0 0 0 0 0 24
1 6 9 5 3 1 0
2 31 56 48 66 2 0
3 23 42 117 114 11 0
4 12 44 150 198 12 0
5 23 44 158 209 20 0
6 68 78 131 71 24 0
df_nums %>% xtabs(~q4r10+q11,.,addNA = T) q11
q4r10 1 2 3 4 5 6
0 156 254 564 595 67 24
1 7 19 45 66 3 0
df_nums %>% xtabs(~q4r11+q11,.,addNA = T) q11
q4r11 1 2 3 4 5 6
0 148 267 606 661 65 8
1 15 6 3 0 5 16
# RULE (A): IF q4r11=TRUE, it means you have no apps. So that means q11 has to equal 6, i.e. NONE. Data shows this is violated. Correcting using this rule.
df_nums$q11[df_nums$q4r11==TRUE] <- 6
df_nums %>% xtabs(~q4r11+q11,.,addNA = T) q11
q4r11 1 2 3 4 5 6
0 148 267 606 661 65 8
1 0 0 0 0 0 45
Since q11 can be ordinal, “none” should equal 0 instead of 6 to preserve ordinality
# RULE (B): Set q11=6 to q11=0
df_nums$q11[df_nums$q11==6] <- 0
df_nums %>% xtabs(~q11,., addNA = T)q11
0 1 2 3 4 5
53 148 267 606 661 65
‘Dont know’ doesn’t add value. Perhaps these can be set to NA and then imputed using mice.
# RULE (C): Set q11 Dont know to NA
df_nums$q11[df_nums$q11==0] <- NA
df_nums %>% xtabs(~q11,., addNA = T)q11
1 2 3 4 5 <NA>
148 267 606 661 65 53
# RULE (D): All NA values for q12 set to 6, since I'm approximating that if you don't have any apps, might as well could as free?
df_nums$q12[is.na(df_nums$q12)] <- 6
df_nums %>% xtabs(~q12,., addNA = T)q12
0 1 2 3 4 5 6
24 24 203 307 416 454 372
map_int(df_nums,~sum(is.na(.x))) %>% sort() %>% tail(3) %>% barchart(main='Missing values')micemap_df(df_nums,~as.factor(.x)) %>%
mice::mice(printFlag = T, m = 5, method = 'rf') -> miceFit
iter imp variable
1 1 q11 q57
1 2 q11 q57
1 3 q11 q57
1 4 q11 q57
1 5 q11 q57
2 1 q11 q57
2 2 q11 q57
2 3 q11 q57
2 4 q11 q57
2 5 q11 q57
3 1 q11 q57
3 2 q11 q57
3 3 q11 q57
3 4 q11 q57
3 5 q11 q57
4 1 q11 q57
4 2 q11 q57
4 3 q11 q57
4 4 q11 q57
4 5 q11 q57
5 1 q11 q57
5 2 q11 q57
5 3 q11 q57
5 4 q11 q57
5 5 q11 q57
unknown timezone 'default/America/Indiana/Indianapolis'Number of logged events: 25
df_nums <- tbl_df(mice::complete(miceFit))# questions_to_consolidate <- c('q13','q24','q25','q26')
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[1])),questions_to_consolidate[1])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[2])),questions_to_consolidate[2], xlim=c(0,.05), ylim=c(-0.01,0.002))
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[3])),questions_to_consolidate[3])
# plot_bi_plots(df_nums %>% dplyr::select(contains(questions_to_consolidate[4])),questions_to_consolidate[4], xlim=c(0,0.05),ylim=c(-0.011,0.005))df_nums <- map_df(df_nums,~as.numeric(as.character(.x)))
df_nums_adj <- df_nums %>%
adjust_q13() %>%
adjust_q24() %>%
adjust_q25() %>%
adjust_q26() %>%
adjust_q2() %>%
adjust_q4() %>%
adjust_race() %>%
adjust_gender() %>%
adjust_marital() %>%
adjust_income() %>%
adjust_age() %>%
adjust_q11() %>%
adjust_names() %>%
adjust_children()package ‘bindrcpp’ was built under R version 3.4.4
glimpse(df_nums_adj)Observations: 1,800
Variables: 43
$ q1_age <dbl> 1, 1, 3, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 3, 1, 3, 2, 2, 1, 2, 1, 2, 4, 3, 2, 1, 1, 2, 2, 1, 2, 3, 2, 3, 1, 1, 2, 1, 1, 2, 2, 1, 2, 3, 2, 4, 2,...
$ q11_appnum <dbl> 4, 3, 3, 4, 3, 4, 4, 3, 1, 2, 3, 2, 4, 4, 4, 3, 4, 3, 4, 4, 2, 4, 1, 4, 4, 1, 3, 3, 3, 4, 3, 4, 3, 1, 2, 3, 2, 4, 4, 0, 4, 3, 4, 3, 4, 4, 3,...
$ q12_freeapppc <dbl> 5, 5, 6, 4, 6, 3, 3, 6, 5, 2, 5, 6, 2, 3, 5, 4, 2, 2, 3, 5, 3, 6, 2, 5, 4, 3, 4, 0, 4, 4, 3, 4, 5, 6, 6, 2, 6, 3, 5, 5, 3, 4, 2, 6, 4, 2, 6,...
$ q48_edu <dbl> 3, 4, 3, 4, 4, 1, 3, 4, 3, 3, 4, 3, 4, 3, 4, 3, 2, 4, 6, 6, 1, 5, 3, 4, 6, 1, 3, 2, 2, 3, 3, 3, 3, 4, 3, 2, 3, 3, 4, 2, 4, 3, 6, 4, 3, 4, 4,...
$ q49_marital <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,...
$ q50r1_nochild <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,...
$ q56_income <dbl> 2, 3, 4, 2, 2, 1, 1, 3, 2, 2, 2, 2, 4, 3, 2, 4, 3, 2, 2, 2, 2, 4, 3, 3, 5, 1, 2, 2, 1, 2, 3, 3, 2, 3, 1, 2, 3, 1, 1, 2, 2, 2, 3, 1, 4, 5, 3,...
$ q57_mf <dbl> 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
$ q13_visitfreq_social <dbl> 2.50, 3.00, 3.25, 4.00, 2.50, 4.00, 2.50, 2.25, 2.75, 2.00, 2.75, 3.25, 1.25, 3.25, 2.00, 2.25, 2.75, 2.00, 2.75, 3.25, 3.25, 3.25, 4.00, 2....
$ q13_visitfreq_music <dbl> 3.75, 3.50, 2.75, 3.25, 3.50, 3.50, 4.00, 2.50, 2.00, 1.25, 3.50, 3.25, 1.75, 3.25, 2.50, 3.75, 2.50, 2.75, 4.00, 3.25, 2.50, 2.50, 2.75, 3....
$ q13_visitfreq_video <dbl> 2.50, 2.00, 2.50, 2.50, 1.75, 2.25, 3.25, 3.25, 2.00, 1.25, 3.50, 2.00, 1.50, 3.25, 2.25, 3.00, 1.75, 2.00, 3.25, 1.75, 3.25, 2.00, 3.25, 2....
$ q24_tech_posatt <dbl> 2.8, 2.8, 3.0, 3.0, 2.2, 3.4, 2.8, 2.0, 2.6, 2.8, 2.2, 2.0, 1.4, 2.6, 2.8, 2.2, 1.0, 3.6, 1.0, 2.6, 1.0, 3.2, 4.4, 2.6, 2.0, 2.2, 1.0, 2.4, ...
$ q24_tech_enter <dbl> 2.0, 2.0, 1.5, 3.0, 2.5, 1.0, 2.5, 2.0, 3.0, 2.5, 2.5, 1.0, 1.5, 2.0, 1.5, 1.5, 1.0, 1.5, 2.5, 2.5, 1.0, 1.0, 4.5, 2.0, 2.5, 3.0, 1.0, 4.5, ...
$ q24_tech_comm <dbl> 2.666667, 1.333333, 2.666667, 3.666667, 3.333333, 1.000000, 3.000000, 2.333333, 3.000000, 2.333333, 4.333333, 1.000000, 2.000000, 1.666667, ...
$ q24_tech_negatv <dbl> 1.0, 3.0, 4.5, 3.0, 1.5, 4.5, 1.5, 4.0, 4.5, 3.0, 5.5, 6.0, 2.0, 5.0, 3.5, 5.0, 1.0, 2.0, 4.0, 3.5, 1.0, 4.0, 4.0, 3.5, 2.0, 3.0, 1.0, 2.5, ...
$ q25_prsnlty_leader <dbl> 1.50, 2.75, 2.50, 2.00, 3.25, 3.00, 3.00, 3.25, 2.75, 2.00, 2.00, 1.00, 1.25, 2.25, 1.50, 2.25, 1.25, 2.50, 1.75, 2.50, 2.75, 1.50, 4.75, 3....
$ q25_prsnlty_risk <dbl> 1.666667, 2.333333, 3.333333, 2.666667, 3.666667, 3.000000, 1.666667, 3.000000, 3.000000, 2.333333, 2.666667, 1.333333, 1.333333, 3.000000, ...
$ q25_prsnlty_drive <dbl> 2.75, 2.25, 2.00, 3.25, 3.00, 2.50, 2.50, 2.25, 2.75, 2.25, 3.00, 1.00, 1.75, 3.00, 1.50, 3.00, 2.25, 2.50, 1.50, 1.00, 2.25, 2.50, 2.00, 3....
$ q25_prsnlty_follower <dbl> 5, 5, 6, 5, 4, 4, 2, 5, 5, 2, 6, 5, 2, 6, 5, 6, 3, 1, 4, 5, 3, 5, 6, 6, 2, 5, 1, 5, 5, 3, 4, 6, 4, 6, 6, 2, 5, 3, 4, 5, 5, 5, 6, 6, 6, 5, 3,...
$ q26_shopsavvy_bargain <dbl> 2.5, 3.5, 3.0, 3.0, 3.0, 3.0, 3.5, 1.5, 3.5, 1.5, 2.5, 2.0, 1.5, 3.5, 3.5, 2.5, 1.0, 3.0, 3.5, 2.0, 3.5, 1.5, 3.5, 3.5, 2.5, 3.5, 1.0, 3.5, ...
$ q26_shopsavvy_brands <dbl> 3.4, 3.2, 3.4, 3.0, 4.0, 3.2, 2.8, 3.8, 5.4, 1.4, 4.4, 3.6, 2.0, 3.6, 2.8, 4.6, 1.4, 2.0, 2.0, 3.0, 3.6, 4.6, 4.0, 4.4, 2.0, 4.6, 1.0, 5.8, ...
$ q26_shopsavvy_earn2spend <dbl> 2.666667, 3.333333, 3.666667, 3.666667, 4.000000, 4.000000, 5.000000, 3.000000, 5.333333, 1.000000, 3.666667, 3.666667, 2.333333, 4.333333, ...
$ q26_shopsavvy_applover <dbl> 4.166667, 3.000000, 4.166667, 2.166667, 3.666667, 2.166667, 3.000000, 4.000000, 4.833333, 1.666667, 4.166667, 3.000000, 2.000000, 2.666667, ...
$ q26_shopsavvy_children <dbl> 6, 6, 4, 6, 3, 5, 5, 6, 5, 1, 6, 6, 1, 3, 6, 2, 1, 3, 1, 4, 5, 6, 3, 5, 2, 5, 1, 6, 5, 5, 3, 4, 2, 3, 6, 2, 6, 6, 3, 4, 5, 3, 5, 3, 6, 5, 1,...
$ q2_apple <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,...
$ q2_andriod <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,...
$ q2_windows <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
$ q2_tablet <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
$ q2_other <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,...
$ q4_use_music_apps <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ q4_use_tv_apps <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,...
$ q4_use_game_apps <dbl> 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,...
$ q4_use_social_apps <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ q4_use_news_apps <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,...
$ q4_use_shop_apps <dbl> 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,...
$ q4_use_none_apps <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q54_white <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ q54_black <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q54_asian <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q54_hawai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q54_native <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q54_other <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ q55_latino <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
set1 <- c(
'q1_age',
# 'q11_appnum',
# 'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
# 'q57_mf',
# 'q13_visitfreq_social',
# 'q13_visitfreq_music',
# 'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
# 'q25_prsnlty_leader',
# 'q25_prsnlty_risk',
# 'q25_prsnlty_drive',
# 'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
'q2_apple',
'q2_andriod',
'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps',
'q54_white',
'q54_black',
'q54_asian',
'q54_hawai',
'q54_native',
# 'q54_other',
'q55_latino'
)
set2 <- c(
'q1_age',
# 'q11_appnum',
'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
# 'q57_mf',
'q13_visitfreq_social',
'q13_visitfreq_music',
'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
'q25_prsnlty_leader',
'q25_prsnlty_risk',
'q25_prsnlty_drive',
'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
# 'q2_apple',
# 'q2_andriod',
# 'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps',
'q54_white',
'q54_black',
'q54_asian',
'q54_hawai',
'q54_native',
# 'q54_other',
'q55_latino'
)
set3 <- c(
'q1_age',
'q11_appnum',
'q12_freeapppc',
'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
'q57_mf',
# 'q13_visitfreq_social',
# 'q13_visitfreq_music',
# 'q13_visitfreq_video',
# 'q24_tech_posatt',
# 'q24_tech_enter',
# 'q24_tech_comm',
# 'q24_tech_negatv',
# 'q25_prsnlty_leader',
# 'q25_prsnlty_risk',
# 'q25_prsnlty_drive',
# 'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children',
'q2_apple',
'q2_andriod',
'q2_windows',
'q2_tablet',
'q2_other',
'q4_use_music_apps',
'q4_use_tv_apps',
'q4_use_game_apps',
'q4_use_social_apps',
'q4_use_news_apps',
'q4_use_shop_apps',
'q4_use_none_apps'
# 'q54_white',
# 'q54_black',
# 'q54_asian',
# 'q54_hawai',
# 'q54_native',
# 'q54_other',
# 'q55_latino'
)
set4 <- c(
'q1_age',
# 'q11_appnum',
# 'q12_freeapppc',
# 'q48_edu',
'q49_marital',
'q50r1_nochild',
'q56_income',
'q57_mf',
'q13_visitfreq_social',
'q13_visitfreq_music',
'q13_visitfreq_video',
'q24_tech_posatt',
'q24_tech_enter',
'q24_tech_comm',
'q24_tech_negatv',
'q25_prsnlty_leader',
'q25_prsnlty_risk',
'q25_prsnlty_drive',
'q25_prsnlty_follower',
'q26_shopsavvy_bargain',
'q26_shopsavvy_brands',
'q26_shopsavvy_earn2spend',
'q26_shopsavvy_applover',
'q26_shopsavvy_children'
# 'q2_apple',
# 'q2_andriod',
# 'q2_windows',
# 'q2_tablet',
# 'q2_other',
# 'q4_use_music_apps',
# 'q4_use_tv_apps',
# 'q4_use_game_apps',
# 'q4_use_social_apps',
# 'q4_use_news_apps',
# 'q4_use_shop_apps',
# 'q4_use_none_apps'
# 'q54_white',
# 'q54_black',
# 'q54_asian',
# 'q54_hawai',
# 'q54_native',
# 'q54_other',
# 'q55_latino'
)df_nums_adj_backup <- df_nums_adjdf_nums_adj <- df_nums_adj_backupdf_nums_adj <- df_nums_adj %>% dplyr::select(one_of(set4))scaling_wanted <- T
minmax_wanted <- F
if (scaling_wanted) {
df_nums_adjscaled <- scale(df_nums_adj)
centers <- attributes(df_nums_adjscaled)[[3]]
spreads <- attributes(df_nums_adjscaled)[[4]]
df_nums_adjscaled <- tbl_df(df_nums_adjscaled)
}
if(minmax_wanted){
df_nums_adjscaled <- map_df(df_nums_adj, ~minmax(.x))
}
if(!scaling_wanted & !minmax_wanted){
df_nums_adjscaled <- df_nums_adj
}
head(df_nums_adjscaled)df_nums_adjscaled %>% cor() %>% corrplot(method = 'shade',tl.col = 'black',tl.cex = .9, order = 'hclust', hclust.method = 'ward.D2')# ordered=c(1:4,11,13:28)
# symm_bin=c(6:10,12,29:47)
# df_nums_adjscaled[ordered] <- map_df(df_nums_adjscaled[ordered],~as.ordered(.x))
# df_nums_adjscaled[symm_bin] <- map_df(df_nums_adjscaled[symm_bin],~as.factor(.x))
distMat <- daisy(df_nums_adjscaled)binary variable(s) 2, 3, 5 treated as interval scaled
methods <- c('complete','average','ward.D','ward.D2','median','mcquitty','centroid')
method_vs_cop <- map_dbl(methods,~cor(cophenetic(hclust(d = distMat, method = .x)), distMat))
names(method_vs_cop) <- methods
barchart(sort(method_vs_cop))k = 6
hclustFit <- hclust(d = distMat, method = 'ward.D')
plot(hclustFit, labels = F)
rect.hclust(hclustFit, k=k, border="red")hclust_segments <- cutree(hclustFit, k = k)
table(hclust_segments)hclust_segments
1 2 3 4 5 6
341 412 333 217 126 371
clusplot(df_nums_adjscaled, hclust_segments, color=TRUE, shade=TRUE, labels=4, lines=0, main="HClust plot")seg.summ(df_nums_adj, hclust_segments) -> centroids
centroidscutFit <- cutree(hclustFit, k)
plot(silhouette(cutFit,distMat))k <- 2:10
widths <- NULL
for(k_ in k){
hclustFit <- hclust(d = distMat, method = 'ward.D')
cutFit <- cutree(hclustFit, k_)
widths <- c(widths,mean(silhouette(cutFit,distMat)[,'sil_width']))
}
plot(k,widths,type='b')cbind(k,widths) k widths
[1,] 2 0.13737473
[2,] 3 0.08281616
[3,] 4 0.06855129
[4,] 5 0.06137392
[5,] 6 0.05921532
[6,] 7 0.06267068
[7,] 8 0.06064109
[8,] 9 0.05293374
[9,] 10 0.04584471
wssplot <- function(numsub, nc=15, seed=1111) {
wss <- (nrow(numsub)-1)*sum(apply(numsub,2,var))
for (i in 2:nc) {
set.seed(seed)
wss[i] <- sum(kmeans(numsub, centers=i, iter.max = 1e4)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
wssplot(df_nums_adjscaled)k <- 2:10
widths <- NULL
for(k_ in k){
clusterresults <- kmeans(x = df_nums_adjscaled, centers = k_, nstart = 30)
widths <- c(widths,mean(silhouette(clusterresults$cluster,distMat)[,'sil_width']))
}did not converge in 10 iterations
plot(k,widths,type='b')cbind(k,widths) k widths
[1,] 2 0.15626749
[2,] 3 0.10919911
[3,] 4 0.10576895
[4,] 5 0.10285049
[5,] 6 0.09506744
[6,] 7 0.09259923
[7,] 8 0.08744827
[8,] 9 0.08373856
[9,] 10 0.08123470
# Using cluster centers from hclust:
seg.summ(df_nums_adjscaled, hclust_segments) -> kmeans_centroids
clusterresults <- kmeans(x = df_nums_adjscaled, centers = kmeans_centroids[,-1])
rsquare <- clusterresults$betweenss/clusterresults$totss
cat('\nWithin SS:',clusterresults$withinss,' Sizes:\n',clusterresults$size,'\nrsq:', rsquare)
Within SS: 4949.8 5143.576 3741.233 3415.204 2907.519 4511.912 Sizes:
383 395 220 254 223 325
rsq: 0.3470117
clusplot(df_nums_adjscaled, clusterresults$cluster, color=TRUE, shade=TRUE, labels=4, lines=0, main="K-means cluster plot")plot(silhouette(clusterresults$cluster,distMat))purrr::map2(kmeans_results, names(kmeans_results),
~bwplot(cluster~.x, kmeans_results,
main=.y))$q1_age
$q49_marital
$q50r1_nochild
$q56_income
$q57_mf
$q13_visitfreq_social
$q13_visitfreq_music
$q13_visitfreq_video
$q24_tech_posatt
$q24_tech_enter
$q24_tech_comm
$q24_tech_negatv
$q25_prsnlty_leader
$q25_prsnlty_risk
$q25_prsnlty_drive
$q25_prsnlty_follower
$q26_shopsavvy_bargain
$q26_shopsavvy_brands
$q26_shopsavvy_earn2spend
$q26_shopsavvy_applover
$q26_shopsavvy_children
$cluster
pamFits <- tibble(k_pam = 2:10)
pamFits$pamFits <- map(pamFits$k_pam, ~pam(df_nums_adjscaled, k = .x))
pamFits$sil_avg_width <- map_dbl(pamFits$pamFits,~.x$silinfo$avg.width)
pamFitspamFits %>%
xyplot(sil_avg_width~as.factor(k_pam),.,type='b')plot(pamFits$pamFits[[1]])seg.summ(df_nums_adj, pamFits$pamFits[[1]]$clustering)mclustFits <- tibble(mclust_clusters=2:8)
mclustFits$mclustFits <- map(mclustFits$mclust_clusters, ~Mclust(df_nums_adjscaled, G = .x))fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
fitting ...
|
| | 0%
|
|=========== | 7%
|
|====================== | 13%
|
|================================== | 20%
|
|============================================= | 27%
|
|======================================================== | 33%
|
|=================================================================== | 40%
|
|============================================================================== | 47%
|
|========================================================================================== | 53%
|
|===================================================================================================== | 60%
|
|================================================================================================================ | 67%
|
|=========================================================================================================================== | 73%
|
|====================================================================================================================================== | 80%
|
|================================================================================================================================================== | 87%
|
|============================================================================================================================================================= | 93%
|
|========================================================================================================================================================================| 100%
mclustFits$bic <- map_dbl(mclustFits$mclustFits, ~.x[['bic']])
mclustFits$loglik <- map_dbl(mclustFits$mclustFits, ~.x[['loglik']])
mclustFitsmclustFits %>%
xyplot(bic+loglik~as.factor(mclust_clusters),., auto.key = T, type = 'b')summary(mclustFits[[4,'mclustFits']])----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VII (spherical, varying volume) model with 5 components:
log.likelihood n df BIC ICL
-48851.89 1800 114 -98558.28 -98984.67
Clustering table:
1 2 3 4 5
527 317 305 275 376
mclust_clusters <- mclustFits[[4,'mclustFits']]$classification
clusplot(df_nums_adj, mclust_clusters, color=TRUE, shade=TRUE, labels=4, lines=0, main="M Clust Plot, k = 4")seg.summ(df_nums_adj, mclust_clusters)mclust_distMat <- daisy(df_nums_adj)binary variable(s) 2, 3, 5 treated as interval scaled
plot(silhouette(mclust_clusters,mclust_distMat))